Original Data¶

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
In [3]:
# Import dataset
df = pd.read_csv('Death.csv')
In [4]:
# Shape
df.shape
Out[4]:
(137700, 16)
In [5]:
# Columns
df.columns
Out[5]:
Index(['Data As Of', 'Start Date', 'End Date', 'Group', 'Year', 'Month',
       'State', 'Sex', 'Age Group', 'COVID-19 Deaths', 'Total Deaths',
       'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
       'Pneumonia, Influenza, or COVID-19 Deaths', 'Footnote'],
      dtype='object')
In [6]:
# Head
df.head()
Out[6]:
Data As Of Start Date End Date Group Year Month State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths Footnote
0 09/27/2023 01/01/2020 09/23/2023 By Total NaN NaN United States All Sexes All Ages 1146774.0 12303399.0 1162844.0 569264.0 22229.0 1760095.0 NaN
1 09/27/2023 01/01/2020 09/23/2023 By Total NaN NaN United States All Sexes Under 1 year 519.0 73213.0 1056.0 95.0 64.0 1541.0 NaN
2 09/27/2023 01/01/2020 09/23/2023 By Total NaN NaN United States All Sexes 0-17 years 1696.0 130970.0 2961.0 424.0 509.0 4716.0 NaN
3 09/27/2023 01/01/2020 09/23/2023 By Total NaN NaN United States All Sexes 1-4 years 285.0 14299.0 692.0 66.0 177.0 1079.0 NaN
4 09/27/2023 01/01/2020 09/23/2023 By Total NaN NaN United States All Sexes 5-14 years 509.0 22008.0 818.0 143.0 219.0 1390.0 NaN
In [7]:
# Information
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137700 entries, 0 to 137699
Data columns (total 16 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Data As Of                                137700 non-null  object 
 1   Start Date                                137700 non-null  object 
 2   End Date                                  137700 non-null  object 
 3   Group                                     137700 non-null  object 
 4   Year                                      134946 non-null  float64
 5   Month                                     123930 non-null  float64
 6   State                                     137700 non-null  object 
 7   Sex                                       137700 non-null  object 
 8   Age Group                                 137700 non-null  object 
 9   COVID-19 Deaths                           98270 non-null   float64
 10  Total Deaths                              118191 non-null  float64
 11  Pneumonia Deaths                          92836 non-null   float64
 12  Pneumonia and COVID-19 Deaths             100816 non-null  float64
 13  Influenza Deaths                          111012 non-null  float64
 14  Pneumonia, Influenza, or COVID-19 Deaths  93467 non-null   float64
 15  Footnote                                  97896 non-null   object 
dtypes: float64(8), object(8)
memory usage: 16.8+ MB
In [8]:
# Find all unique values for object columns
for column in df.columns:
  if df[column].dtype == object:
    print(f"Unique values for {column}: {df[column].unique()}")
Unique values for Data As Of: ['09/27/2023']
Unique values for Start Date: ['01/01/2020' '01/01/2021' '01/01/2022' '01/01/2023' '02/01/2020'
 '03/01/2020' '04/01/2020' '05/01/2020' '06/01/2020' '07/01/2020'
 '08/01/2020' '09/01/2020' '10/01/2020' '11/01/2020' '12/01/2020'
 '02/01/2021' '03/01/2021' '04/01/2021' '05/01/2021' '06/01/2021'
 '07/01/2021' '08/01/2021' '09/01/2021' '10/01/2021' '11/01/2021'
 '12/01/2021' '02/01/2022' '03/01/2022' '04/01/2022' '05/01/2022'
 '06/01/2022' '07/01/2022' '08/01/2022' '09/01/2022' '10/01/2022'
 '11/01/2022' '12/01/2022' '02/01/2023' '03/01/2023' '04/01/2023'
 '05/01/2023' '06/01/2023' '07/01/2023' '08/01/2023' '09/01/2023']
Unique values for End Date: ['09/23/2023' '12/31/2020' '12/31/2021' '12/31/2022' '01/31/2020'
 '02/29/2020' '03/31/2020' '04/30/2020' '05/31/2020' '06/30/2020'
 '07/31/2020' '08/31/2020' '09/30/2020' '10/31/2020' '11/30/2020'
 '01/31/2021' '02/28/2021' '03/31/2021' '04/30/2021' '05/31/2021'
 '06/30/2021' '07/31/2021' '08/31/2021' '09/30/2021' '10/31/2021'
 '11/30/2021' '01/31/2022' '02/28/2022' '03/31/2022' '04/30/2022'
 '05/31/2022' '06/30/2022' '07/31/2022' '08/31/2022' '09/30/2022'
 '10/31/2022' '11/30/2022' '01/31/2023' '02/28/2023' '03/31/2023'
 '04/30/2023' '05/31/2023' '06/30/2023' '07/31/2023' '08/31/2023']
Unique values for Group: ['By Total' 'By Year' 'By Month']
Unique values for State: ['United States' 'Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California'
 'Colorado' 'Connecticut' 'Delaware' 'District of Columbia' 'Florida'
 'Georgia' 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Iowa' 'Kansas'
 'Kentucky' 'Louisiana' 'Maine' 'Maryland' 'Massachusetts' 'Michigan'
 'Minnesota' 'Mississippi' 'Missouri' 'Montana' 'Nebraska' 'Nevada'
 'New Hampshire' 'New Jersey' 'New Mexico' 'New York' 'New York City'
 'North Carolina' 'North Dakota' 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania'
 'Rhode Island' 'South Carolina' 'South Dakota' 'Tennessee' 'Texas' 'Utah'
 'Vermont' 'Virginia' 'Washington' 'West Virginia' 'Wisconsin' 'Wyoming'
 'Puerto Rico']
Unique values for Sex: ['All Sexes' 'Male' 'Female']
Unique values for Age Group: ['All Ages' 'Under 1 year' '0-17 years' '1-4 years' '5-14 years'
 '15-24 years' '18-29 years' '25-34 years' '30-39 years' '35-44 years'
 '40-49 years' '45-54 years' '50-64 years' '55-64 years' '65-74 years'
 '75-84 years' '85 years and over']
Unique values for Footnote: [nan
 'One or more data cells have counts between 1-9 and have been suppressed in accordance with NCHS confidentiality standards.']
In [9]:
# Checking for missing data
df.isna().sum()
Out[9]:
Data As Of                                      0
Start Date                                      0
End Date                                        0
Group                                           0
Year                                         2754
Month                                       13770
State                                           0
Sex                                             0
Age Group                                       0
COVID-19 Deaths                             39430
Total Deaths                                19509
Pneumonia Deaths                            44864
Pneumonia and COVID-19 Deaths               36884
Influenza Deaths                            26688
Pneumonia, Influenza, or COVID-19 Deaths    44233
Footnote                                    39804
dtype: int64
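Before dropping Footnote, it is worth confirming that the missing death counts coincide with NCHS suppression; a minimal check (run while Footnote is still present):

# Suppressed cells (Footnote set) should account for the missing counts
suppressed = df['Footnote'].notna()
print(pd.crosstab(suppressed, df['COVID-19 Deaths'].isna(),
                  rownames=['suppressed'], colnames=['COVID-19 Deaths missing']))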
In [10]:
# Drop columns not needed for the analysis
columns_to_drop = ["Data As Of", "Start Date", "End Date", "Footnote"]
df = df.drop(columns=columns_to_drop, errors='ignore')
In [11]:
df.head()
Out[11]:
Group Year Month State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths
0 By Total NaN NaN United States All Sexes All Ages 1146774.0 12303399.0 1162844.0 569264.0 22229.0 1760095.0
1 By Total NaN NaN United States All Sexes Under 1 year 519.0 73213.0 1056.0 95.0 64.0 1541.0
2 By Total NaN NaN United States All Sexes 0-17 years 1696.0 130970.0 2961.0 424.0 509.0 4716.0
3 By Total NaN NaN United States All Sexes 1-4 years 285.0 14299.0 692.0 66.0 177.0 1079.0
4 By Total NaN NaN United States All Sexes 5-14 years 509.0 22008.0 818.0 143.0 219.0 1390.0
In [12]:
# For all coulmns that "Group" = "By Total", create a new df called df1
df1 = df[df['Group'] == 'By Total']
In [13]:
# Drop "Year", "Month" and "Group" in df1
df1 = df1.drop(['Year', 'Month', 'Group'], axis=1)
In [14]:
df1.head()
Out[14]:
State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths
0 United States All Sexes All Ages 1146774.0 12303399.0 1162844.0 569264.0 22229.0 1760095.0
1 United States All Sexes Under 1 year 519.0 73213.0 1056.0 95.0 64.0 1541.0
2 United States All Sexes 0-17 years 1696.0 130970.0 2961.0 424.0 509.0 4716.0
3 United States All Sexes 1-4 years 285.0 14299.0 692.0 66.0 177.0 1079.0
4 United States All Sexes 5-14 years 509.0 22008.0 818.0 143.0 219.0 1390.0
In [15]:
# For all coulmns that "Group" = "By Year", create a new df called df2
df2 = df[df['Group'] == 'By Year']
In [16]:
# Drop "Month" and "Group" in df2
df2 = df2.drop(['Month', 'Group'], axis=1)
df2.head()
Out[16]:
Year State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths
2754 2020.0 United States All Sexes All Ages 385666.0 3390039.0 352010.0 180086.0 8787.0 565226.0
2755 2020.0 United States All Sexes Under 1 year 52.0 19645.0 242.0 9.0 21.0 306.0
2756 2020.0 United States All Sexes 0-17 years 199.0 34204.0 629.0 36.0 179.0 971.0
2757 2020.0 United States All Sexes 1-4 years 25.0 3539.0 134.0 4.0 61.0 216.0
2758 2020.0 United States All Sexes 5-14 years 69.0 5644.0 173.0 12.0 76.0 306.0
In [17]:
# For all coulmns that "Group" = "By Month", create a new df called df3
df3 = df[df['Group'] == 'By Month']
In [18]:
# Drop "Group"
df3 = df3.drop('Group', axis=1)
In [19]:
df3.head()
Out[19]:
Year Month State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths
13770 2020.0 1.0 United States All Sexes All Ages 6.0 264677.0 17909.0 3.0 2125.0 20037.0
13771 2020.0 1.0 United States All Sexes Under 1 year 0.0 1784.0 41.0 0.0 8.0 49.0
13772 2020.0 1.0 United States All Sexes 0-17 years 0.0 2966.0 90.0 0.0 63.0 153.0
13773 2020.0 1.0 United States All Sexes 1-4 years 0.0 315.0 22.0 0.0 18.0 40.0
13774 2020.0 1.0 United States All Sexes 5-14 years 0.0 471.0 21.0 0.0 29.0 50.0
In [20]:
# Remove rows with null values in df1, df2, and df3
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()
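Since NCHS suppresses counts of 1-9, dropna disproportionately removes small strata (young age groups, small states). A quick look at how much data survives:

# Row counts remaining after dropping suppressed/missing values
for name, frame in [('df1', df1), ('df2', df2), ('df3', df3)]:
    print(f"{name}: {len(frame)} rows remain")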
In [21]:
# Check for null values in df1, df2, and df3
print("Null values in df1:\n", df1.isnull().sum())
print("\nNull values in df2:\n", df2.isnull().sum())
print("\nNull values in df3:\n", df3.isnull().sum())
Null values in df1:
 State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64

Null values in df2:
 Year                                        0
State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64

Null values in df3:
 Year                                        0
Month                                       0
State                                       0
Sex                                         0
Age Group                                   0
COVID-19 Deaths                             0
Total Deaths                                0
Pneumonia Deaths                            0
Pneumonia and COVID-19 Deaths               0
Influenza Deaths                            0
Pneumonia, Influenza, or COVID-19 Deaths    0
dtype: int64
In [22]:
# Check duplicate data
duplicates_df1 = df1[df1.duplicated()]

duplicates_df2 = df2[df2.duplicated()]

duplicates_df3 = df3[df3.duplicated()]

print("Duplicate rows in df1:\n", duplicates_df1)
print("\nDuplicate rows in df2:\n", duplicates_df2)
print("\nDuplicate rows in df3:\n", duplicates_df3)
Duplicate rows in df1:
 Empty DataFrame
Columns: [State, Sex, Age Group, COVID-19 Deaths, Total Deaths, Pneumonia Deaths, Pneumonia and COVID-19 Deaths, Influenza Deaths, Pneumonia, Influenza, or COVID-19 Deaths]
Index: []

Duplicate rows in df2:
 Empty DataFrame
Columns: [Year, State, Sex, Age Group, COVID-19 Deaths, Total Deaths, Pneumonia Deaths, Pneumonia and COVID-19 Deaths, Influenza Deaths, Pneumonia, Influenza, or COVID-19 Deaths]
Index: []

Duplicate rows in df3:
 Empty DataFrame
Columns: [Year, Month, State, Sex, Age Group, COVID-19 Deaths, Total Deaths, Pneumonia Deaths, Pneumonia and COVID-19 Deaths, Influenza Deaths, Pneumonia, Influenza, or COVID-19 Deaths]
Index: []

By total¶

In [24]:
# Remove "United States" in "State"

df1 = df1[df1['State'] != 'United States']
In [25]:
# Map state names to USPS abbreviations
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID',
    'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS',
    'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN', 'Mississippi': 'MS',
    'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE', 'Nevada': 'NV',
    'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK',
    'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY'
}

df1['State'] = df1['State'].map(us_state_abbrev)
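Note that Series.map returns NaN for any name missing from the dict, so 'District of Columbia', 'New York City', and 'Puerto Rico' silently drop out of the state-level analysis here. A quick check:

# Rows whose State had no USPS mapping are now NaN and will be
# skipped by the groupby and choropleth below
print(df1['State'].isna().sum(), "rows with unmapped State")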
In [26]:
df1.head()
Out[26]:
State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths
51 AL All Sexes All Ages 21520.0 231602.0 17619.0 7411.0 356.0 32038.0
59 AL All Sexes 30-39 years 416.0 6827.0 319.0 147.0 13.0 599.0
60 AL All Sexes 35-44 years 670.0 8639.0 468.0 231.0 16.0 921.0
61 AL All Sexes 40-49 years 1053.0 11224.0 748.0 359.0 14.0 1455.0
62 AL All Sexes 45-54 years 1628.0 15413.0 1114.0 557.0 23.0 2206.0
In [27]:
# Remove "All Sexes" in "Sex" and "All Ages" in "Age Group"
df1 = df1[df1['Sex'] != 'All Sexes']
df1 = df1[df1['Age Group'] != 'All Ages']
In [28]:
# States with the highest death counts
death_categories_top5 = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
                         'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
                         'Pneumonia, Influenza, or COVID-19 Deaths']

state_death_summary_top5 = df1.groupby('State')[death_categories_top5].sum().reset_index()

ranked_states = {}

# 'Total Deaths' is already in the list, so iterate over it once
for category in death_categories_top5:
    ranked_states[category] = state_death_summary_top5[['State', category]] \
        .sort_values(by=category, ascending=False) \
        .head(5)

for category, data in ranked_states.items():
    print(f"Top 5 States for {category}:")
    print(data)
    print()
Top 5 States for COVID-19 Deaths:
   State  COVID-19 Deaths
4     CA         139640.0
42    TX         137829.0
8     FL         100759.0
37    PA          61219.0
34    OH          58631.0

Top 5 States for Total Deaths:
   State  Total Deaths
4     CA     1472026.0
42    TX     1191557.0
8     FL     1114690.0
37    PA      633413.0
34    OH      620127.0

Top 5 States for Pneumonia Deaths:
   State  Pneumonia Deaths
4     CA          155609.0
42    TX          134735.0
8     FL          118012.0
34    OH           54056.0
37    PA           53160.0

Top 5 States for Pneumonia and COVID-19 Deaths:
   State  Pneumonia and COVID-19 Deaths
4     CA                        79958.0
42    TX                        77169.0
8     FL                        62463.0
34    OH                        28950.0
37    PA                        27002.0

Top 5 States for Influenza Deaths:
   State  Influenza Deaths
4     CA            2316.0
42    TX            2079.0
8     FL            1568.0
37    PA            1125.0
34    OH            1082.0

Top 5 States for Pneumonia, Influenza, or COVID-19 Deaths:
   State  Pneumonia, Influenza, or COVID-19 Deaths
4     CA                                  217494.0
42    TX                                  197226.0
8     FL                                  157738.0
37    PA                                   88401.0
34    OH                                   84764.0

In [29]:
# Draw maps. Aggregate to one value per state first; plotting df1 directly
# would pass duplicate state codes and the choropleth would only reflect
# the last (sex, age) row drawn for each state.
state_totals = df1.groupby('State', as_index=False)[death_categories_top5].sum()

for category in death_categories_top5:
    fig = px.choropleth(state_totals,
                        locations='State',
                        locationmode="USA-states",
                        color=category,
                        scope="usa",
                        color_continuous_scale="reds",
                        title=f'{category} by State')
    fig.show()

By Year¶

In [31]:
# Remove "All Sexes" in "Sex" and "All Ages" in "Age Group" in df2
df2 = df2[df2['Sex'] != 'All Sexes']
df2 = df2[df2['Age Group'] != 'All Ages']
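One caveat for the rankings below: the file mixes two overlapping age-bin schemes (for example '0-17 years' overlaps '1-4 years' and '5-14 years'), so summing across every Age Group double counts deaths. A sketch of restricting to a single non-overlapping scheme (my reading of the bins; worth verifying against the NCHS data dictionary):

# Keep one age-binning scheme so strata do not overlap
non_overlapping = ['Under 1 year', '1-4 years', '5-14 years', '15-24 years',
                   '25-34 years', '35-44 years', '45-54 years', '55-64 years',
                   '65-74 years', '75-84 years', '85 years and over']
df2_single_scheme = df2[df2['Age Group'].isin(non_overlapping)]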
In [32]:
# Death by Sex
death_categories = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
                         'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
                         'Pneumonia, Influenza, or COVID-19 Deaths']

sex_death_summary = df2.groupby('Sex')[death_categories].sum().reset_index()

ranked_sex = {}

for category in death_categories:
    ranked_sex[category] = sex_death_summary[['Sex', category]] \
        .sort_values(by=category, ascending=False)

for category, data in ranked_sex.items():
    print(f"Ranking for {category} by sex:")
    print(data)
    print()
Ranking for COVID-19 Deaths by sex:
      Sex  COVID-19 Deaths
1    Male        1290612.0
0  Female        1013424.0

Ranking for Total Deaths by sex:
      Sex  Total Deaths
1    Male    13414684.0
0  Female    11471872.0

Ranking for Pneumonia Deaths by sex:
      Sex  Pneumonia Deaths
1    Male         1307286.0
0  Female         1024331.0

Ranking for Pneumonia and COVID-19 Deaths by sex:
      Sex  Pneumonia and COVID-19 Deaths
1    Male                       667663.0
0  Female                       479919.0

Ranking for Influenza Deaths by sex:
      Sex  Influenza Deaths
0  Female           25070.0
1    Male           24746.0

Ranking for Pneumonia, Influenza, or COVID-19 Deaths by sex:
      Sex  Pneumonia, Influenza, or COVID-19 Deaths
1    Male                                 1951649.0
0  Female                                 1580457.0

In [33]:
# Grouped bar chart of total deaths by category and sex
df_sex_category = df2.groupby('Sex')[['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths','Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']].sum()


categories = df_sex_category.columns
x = np.arange(len(categories))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))

ax.bar(x - width/2, df_sex_category.loc['Male'], width, label='Male', color='blue')
ax.bar(x + width/2, df_sex_category.loc['Female'], width, label='Female', color='orange')

ax.set_xlabel('Death Category')
ax.set_ylabel('Total Death Counts')
ax.set_title('Total Deaths by Category and Sex')
ax.set_xticks(x)
ax.set_xticklabels(categories, rotation=45, ha='right')
ax.legend()

plt.tight_layout()
plt.show()
[Figure: grouped bar chart, Total Deaths by Category and Sex]
In [34]:
# Unique values in "Age Group"
print(df1['Age Group'].unique())
['45-54 years' '50-64 years' '55-64 years' '65-74 years' '75-84 years'
 '85 years and over' '5-14 years' '35-44 years' '40-49 years' '0-17 years'
 '18-29 years' '25-34 years' '30-39 years' '15-24 years' 'Under 1 year'
 '1-4 years']
In [35]:
# Death by Age
death_categories = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
                         'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
                         'Pneumonia, Influenza, or COVID-19 Deaths']

age_death_summary = df2.groupby('Age Group')[death_categories].sum().reset_index()

ranked_age = {}

for category in death_categories:
    ranked_age[category] = age_death_summary[['Age Group', category]] \
        .sort_values(by=category, ascending=False).head(5)

for category, data in ranked_age.items():
    print(f"Ranking for {category} by age:")
    print(data)
    print()
Ranking for COVID-19 Deaths by age:
            Age Group  COVID-19 Deaths
14  85 years and over         539510.0
13        75-84 years         499796.0
12        65-74 years         417231.0
10        50-64 years         324039.0
11        55-64 years         248667.0

Ranking for Total Deaths by age:
            Age Group  Total Deaths
14  85 years and over     6045580.0
13        75-84 years     5157094.0
12        65-74 years     4135015.0
10        50-64 years     3324078.0
11        55-64 years     2538614.0

Ranking for Pneumonia Deaths by age:
            Age Group  Pneumonia Deaths
13        75-84 years          535288.0
14  85 years and over          521898.0
12        65-74 years          447444.0
10        50-64 years          326290.0
11        55-64 years          253520.0

Ranking for Pneumonia and COVID-19 Deaths by age:
            Age Group  Pneumonia and COVID-19 Deaths
13        75-84 years                       249998.0
12        65-74 years                       224917.0
14  85 years and over                       223667.0
10        50-64 years                       173917.0
11        55-64 years                       133561.0

Ranking for Influenza Deaths by age:
            Age Group  Influenza Deaths
14  85 years and over           10039.0
13        75-84 years           10014.0
12        65-74 years            8513.0
10        50-64 years            7935.0
11        55-64 years            6065.0

Ranking for Pneumonia, Influenza, or COVID-19 Deaths by age:
            Age Group  Pneumonia, Influenza, or COVID-19 Deaths
14  85 years and over                                  846767.0
13        75-84 years                                  793993.0
12        65-74 years                                  647175.0
10        50-64 years                                  483345.0
11        55-64 years                                  373925.0

In [36]:
# Grouped bar chart of death counts by age group
df_age_group = df2.groupby('Age Group')[['COVID-19 Deaths', 'Pneumonia Deaths',
                                         'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
                                         'Pneumonia, Influenza, or COVID-19 Deaths']].sum()

age_order = ['Under 1 year', '1-4 years', '5-14 years', '15-24 years', '18-29 years',
             '25-34 years', '30-39 years', '35-44 years', '40-49 years',
             '45-54 years', '50-64 years', '55-64 years', '65-74 years',
             '75-84 years', '85 years and over']

df_age_group = df_age_group.reindex(age_order)

death_categories = ['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths',
                    'Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']
colors = ['blue', 'orange', 'green', 'red', 'purple']

x = np.arange(len(age_order))
width = 0.15

plt.figure(figsize=(15, 8))

for i, (category, color) in enumerate(zip(death_categories, colors)):
    plt.bar(x + i * width, df_age_group[category], width, label=category, color=color)

plt.title('Death Counts by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Death Counts')
plt.xticks(x + width * 2, age_order, rotation=45, ha='right')
plt.legend()
plt.grid(True, axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()
[Figure: grouped bar chart, Death Counts by Age Group]

By Month¶

In [38]:
# Unique values
unique_years_df3 = df3['Year'].unique()
unique_months_df3 = df3['Month'].unique()

print("Unique years in df3:", unique_years_df3)
print("Unique months in df3:", unique_months_df3)
Unique years in df3: [2020. 2021. 2022. 2023.]
Unique months in df3: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12.]
In [39]:
# Build a Date column from Year and Month
df3['Year'] = df3['Year'].astype(int)
df3['Month'] = df3['Month'].astype(int)

df3['Date'] = pd.to_datetime(df3['Year'].astype(str) + '-' + df3['Month'].astype(str), format='%Y-%m')
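An equivalent construction assembles the dates from component columns directly:

# Equivalent: pandas assembles datetimes from year/month/day columns
df3['Date'] = pd.to_datetime(df3[['Year', 'Month']]
                             .rename(columns=str.lower)
                             .assign(day=1))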
In [40]:
# Months with the highest death counts. df3 still contains the national
# 'United States' / 'All Sexes' / 'All Ages' aggregate rows, which dominate
# these rankings; a date can also appear twice when two such rows share it
# (see the influenza list below).
death_categories_top6 = ['COVID-19 Deaths', 'Total Deaths', 'Pneumonia Deaths',
                         'Pneumonia and COVID-19 Deaths', 'Influenza Deaths',
                         'Pneumonia, Influenza, or COVID-19 Deaths']

ranked_dates = {}

for category in death_categories_top6:
    ranked_dates[category] = df3[['Date', category]] \
        .sort_values(by=category, ascending=False) \
        .head(5)

for category, data in ranked_dates.items():
    print(f"Top 5 Dates for {category}:")
    print(data)
    print()
Top 5 Dates for COVID-19 Deaths:
            Date  COVID-19 Deaths
14382 2021-01-01         105565.0
14331 2020-12-01          98174.0
14994 2022-01-01          84011.0
13923 2020-04-01          65550.0
14790 2021-09-01          63444.0

Top 5 Dates for Total Deaths:
            Date  Total Deaths
14382 2021-01-01      373641.0
14994 2022-01-01      370245.0
14331 2020-12-01      367203.0
13923 2020-04-01      322414.0
14943 2021-12-01      320036.0

Top 5 Dates for Pneumonia Deaths:
            Date  Pneumonia Deaths
14382 2021-01-01           69849.0
14331 2020-12-01           62916.0
14994 2022-01-01           59484.0
14790 2021-09-01           51085.0
13923 2020-04-01           46427.0

Top 5 Dates for Pneumonia and COVID-19 Deaths:
            Date  Pneumonia and COVID-19 Deaths
14382 2021-01-01                        55416.0
14331 2020-12-01                        48324.0
14994 2022-01-01                        43699.0
14790 2021-09-01                        38294.0
14399 2021-01-01                        32387.0

Top 5 Dates for Influenza Deaths:
            Date  Influenza Deaths
15555 2022-12-01            4460.0
13872 2020-03-01            2437.0
15589 2022-12-01            2411.0
13821 2020-02-01            2373.0
15606 2023-01-01            2238.0

Top 5 Dates for Pneumonia, Influenza, or COVID-19 Deaths:
            Date  Pneumonia, Influenza, or COVID-19 Deaths
14382 2021-01-01                                  120079.0
14331 2020-12-01                                  112842.0
14994 2022-01-01                                  100272.0
13923 2020-04-01                                   84003.0
14790 2021-09-01                                   76294.0

In [41]:
# Total deaths by sex over time

df_grouped = df3.groupby(['Date', 'Sex'])['Total Deaths'].sum().reset_index()

plt.figure(figsize=(12, 6))

for sex, color in zip(['Male', 'Female'], ['blue', 'orange']):
    df_subset = df_grouped[df_grouped['Sex'] == sex]
    plt.plot(df_subset['Date'], df_subset['Total Deaths'], label=sex, color=color, linewidth=2)

plt.title('Total Deaths by Sex Over Time')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()
[Figure: line chart, Total Deaths by Sex Over Time]
In [42]:
# Each of the five causes of death by sex over time
death_categories = ['COVID-19 Deaths', 'Pneumonia Deaths', 'Pneumonia and COVID-19 Deaths','Influenza Deaths', 'Pneumonia, Influenza, or COVID-19 Deaths']

plt.figure(figsize=(15, 15))

for i, category in enumerate(death_categories, 1):
    plt.subplot(len(death_categories), 1, i)

    df_grouped = df3.groupby(['Date', 'Sex'])[category].sum().reset_index()

    for sex, color in zip(['Male', 'Female'], ['blue', 'orange']):
        df_subset = df_grouped[df_grouped['Sex'] == sex]
        plt.plot(df_subset['Date'], df_subset[category], label=sex, color=color, linewidth=2)

    plt.title(f'{category} by Sex Over Time')
    plt.xlabel('Date')
    plt.ylabel(category)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()
[Figure: five stacked line charts, one per death category, by sex over time]
In [43]:
df3.head()
Out[43]:
Year Month State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths Date
13770 2020 1 United States All Sexes All Ages 6.0 264677.0 17909.0 3.0 2125.0 20037.0 2020-01-01
13771 2020 1 United States All Sexes Under 1 year 0.0 1784.0 41.0 0.0 8.0 49.0 2020-01-01
13772 2020 1 United States All Sexes 0-17 years 0.0 2966.0 90.0 0.0 63.0 153.0 2020-01-01
13773 2020 1 United States All Sexes 1-4 years 0.0 315.0 22.0 0.0 18.0 40.0 2020-01-01
13774 2020 1 United States All Sexes 5-14 years 0.0 471.0 21.0 0.0 29.0 50.0 2020-01-01
In [44]:
# Drop Year and Month from df3 (Date now carries both)
df3 = df3.drop(['Year', 'Month'], axis=1)
df3.head()
Out[44]:
State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths Date
13770 United States All Sexes All Ages 6.0 264677.0 17909.0 3.0 2125.0 20037.0 2020-01-01
13771 United States All Sexes Under 1 year 0.0 1784.0 41.0 0.0 8.0 49.0 2020-01-01
13772 United States All Sexes 0-17 years 0.0 2966.0 90.0 0.0 63.0 153.0 2020-01-01
13773 United States All Sexes 1-4 years 0.0 315.0 22.0 0.0 18.0 40.0 2020-01-01
13774 United States All Sexes 5-14 years 0.0 471.0 21.0 0.0 29.0 50.0 2020-01-01
In [115]:
df4 = df3.copy()
df4 = df4[~((df4['State'] == 'United States') | (df4['Sex'] == 'All Sexes') | (df4['Age Group'] == 'All Ages'))]
df4.head()
Out[115]:
State Sex Age Group COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths Date
16083 Alabama Male Under 1 year 0.0 17.0 0.0 0.0 0.0 0.0 2020-01-01
16086 Alabama Male 5-14 years 0.0 11.0 0.0 0.0 0.0 0.0 2020-01-01
16088 Alabama Male 18-29 years 0.0 53.0 0.0 0.0 0.0 0.0 2020-01-01
16100 Alabama Female Under 1 year 0.0 14.0 0.0 0.0 0.0 0.0 2020-01-01
16104 Alabama Female 15-24 years 0.0 13.0 0.0 0.0 0.0 0.0 2020-01-01
In [117]:
# One-hot encode all object columns in df4
df4 = pd.get_dummies(df4, columns=df4.select_dtypes(include=['object']).columns, dummy_na=False)

df4.head()
Out[117]:
COVID-19 Deaths Total Deaths Pneumonia Deaths Pneumonia and COVID-19 Deaths Influenza Deaths Pneumonia, Influenza, or COVID-19 Deaths Date State_Alabama State_Alaska State_Arizona ... Age Group_35-44 years Age Group_40-49 years Age Group_45-54 years Age Group_5-14 years Age Group_50-64 years Age Group_55-64 years Age Group_65-74 years Age Group_75-84 years Age Group_85 years and over Age Group_Under 1 year
16083 0.0 17.0 0.0 0.0 0.0 0.0 2020-01-01 True False False ... False False False False False False False False False True
16086 0.0 11.0 0.0 0.0 0.0 0.0 2020-01-01 True False False ... False False False True False False False False False False
16088 0.0 53.0 0.0 0.0 0.0 0.0 2020-01-01 True False False ... False False False False False False False False False False
16100 0.0 14.0 0.0 0.0 0.0 0.0 2020-01-01 True False False ... False False False False False False False False False True
16104 0.0 13.0 0.0 0.0 0.0 0.0 2020-01-01 True False False ... False False False False False False False False False False

5 rows × 78 columns
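One caveat: with every dummy level kept, each categorical's indicators sum to one, making the design matrix perfectly collinear for the unregularized linear regression later. Passing drop_first=True encodes against a reference level instead; a sketch (df_pre_encoding is a hypothetical name for the frame before get_dummies):

# Variant: drop one reference level per categorical to avoid the dummy trap
df4_alt = pd.get_dummies(df_pre_encoding,
                         columns=df_pre_encoding.select_dtypes(include=['object']).columns,
                         drop_first=True)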

In [119]:
df4.dtypes
Out[119]:
COVID-19 Deaths                  float64
Total Deaths                     float64
Pneumonia Deaths                 float64
Pneumonia and COVID-19 Deaths    float64
Influenza Deaths                 float64
                                  ...   
Age Group_55-64 years               bool
Age Group_65-74 years               bool
Age Group_75-84 years               bool
Age Group_85 years and over         bool
Age Group_Under 1 year              bool
Length: 78, dtype: object
In [121]:
df5 = df4.copy()
In [49]:
# Convert Date to int64 (nanoseconds since epoch) and cast everything,
# including the boolean dummies, to float
df4['Date'] = df4['Date'].astype('int64')
df4 = df4.astype(float)

print(df4.dtypes)
COVID-19 Deaths                  float64
Total Deaths                     float64
Pneumonia Deaths                 float64
Pneumonia and COVID-19 Deaths    float64
Influenza Deaths                 float64
                                  ...   
Age Group_55-64 years            float64
Age Group_65-74 years            float64
Age Group_75-84 years            float64
Age Group_85 years and over      float64
Age Group_Under 1 year           float64
Length: 78, dtype: object
In [50]:
# Correlation heatmap over all 78 columns
corr_matrix = df4.corr()

sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')

plt.show()
[Figure: correlation heatmap of all columns]
In [51]:
# For each column, its strongest correlation with any other column
corr_matrix = df4.corr()

max_corr_values = {}

for col in corr_matrix.columns:
    max_corr_values[col] = corr_matrix[col].drop(col).max()

sorted_corr = pd.Series(max_corr_values).sort_values(ascending=False).head(20)

print(sorted_corr)
COVID-19 Deaths                             0.972556
Pneumonia, Influenza, or COVID-19 Deaths    0.972556
Pneumonia Deaths                            0.971805
Pneumonia and COVID-19 Deaths               0.966959
Total Deaths                                0.863321
Age Group_85 years and over                 0.378147
State_California                            0.374744
Influenza Deaths                            0.362005
Age Group_75-84 years                       0.293873
Age Group_65-74 years                       0.213524
State_Florida                               0.178320
State_Texas                                 0.170845
Age Group_50-64 years                       0.153834
State_New York City                         0.126236
State_Pennsylvania                          0.118171
State_Ohio                                  0.114080
Age Group_55-64 years                       0.106572
State_Illinois                              0.097415
State_Vermont                               0.087898
Age Group_5-14 years                        0.087898
dtype: float64
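Taking the plain max ignores strong negative correlations; ranking by absolute value catches both signs:

# Variant: rank columns by their strongest correlation of either sign
max_abs_corr = {col: corr_matrix[col].drop(col).abs().max()
                for col in corr_matrix.columns}
print(pd.Series(max_abs_corr).sort_values(ascending=False).head(20))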
In [52]:
# Heatmap restricted to the 20 most-correlated columns
corr_matrix = df4.corr()

max_corr_values = {}

for col in corr_matrix.columns:
    max_corr_values[col] = corr_matrix[col].drop(col).max()

sorted_cols = pd.Series(max_corr_values).sort_values(ascending=False).head(20).index.tolist()

filtered_corr_matrix = corr_matrix.loc[sorted_cols, sorted_cols]

sns.heatmap(filtered_corr_matrix, annot=False, cmap='coolwarm', cbar=True)

plt.show()
[Figure: heatmap of the 20 most-correlated columns]

Modeling¶

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
In [70]:
# Set X, y and split into train and test sets
X = df4.drop('COVID-19 Deaths', axis=1)
y = df4['COVID-19 Deaths']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [72]:
# Augment the data to ~100k rows with interpolated and noise-jittered
# copies of existing rows. Note the augmentation runs before the
# train/test split (see the leakage caveat below).
target_size = 100000
current_size = len(X)

num_to_generate = target_size - current_size

def generate_interpolated_samples(X, y, num_samples):
    interpolated_X = []
    interpolated_y = []
    
    for _ in range(num_samples):
        idx1, idx2 = np.random.choice(len(X), size=2, replace=False)
        
        alpha = np.random.rand()
        
        new_sample_X = alpha * X[idx1] + (1 - alpha) * X[idx2]
        new_sample_y = alpha * y[idx1] + (1 - alpha) * y[idx2]
        
        interpolated_X.append(new_sample_X)
        interpolated_y.append(new_sample_y)
    
    return np.array(interpolated_X), np.array(interpolated_y)

def add_random_noise(X, y, num_samples, noise_level=0.01):
    noisy_X = []
    noisy_y = []
    
    for _ in range(num_samples):
        idx = np.random.choice(len(X))
        
        new_sample_X = X[idx] + np.random.normal(0, noise_level, X.shape[1])
        
        new_sample_y = y[idx] + np.random.normal(0, noise_level)
        
        noisy_X.append(new_sample_X)
        noisy_y.append(new_sample_y)
    
    return np.array(noisy_X), np.array(noisy_y)

num_interpolated = num_to_generate // 2
num_noisy = num_to_generate - num_interpolated

X_values = X.values
y_values = y.values

X_interpolated, y_interpolated = generate_interpolated_samples(X_values, y_values, num_interpolated)

X_noisy, y_noisy = add_random_noise(X_values, y_values, num_noisy)

X_augmented = np.vstack([X_values, X_interpolated, X_noisy])
y_augmented = np.hstack([y_values, y_interpolated, y_noisy])

X_augmented = pd.DataFrame(X_augmented, columns=X.columns)
y_augmented = pd.Series(y_augmented, name='target')

X_train, X_test, y_train, y_test = train_test_split(X_augmented, y_augmented, test_size=0.2, random_state=42)
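Because the augmentation runs before the split, interpolated and noisy copies of the same source rows land on both sides of it, which flatters every score reported below. A leakage-free sketch would split first and synthesize only from the training portion:

# Sketch: split first, then augment the training set only
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)
X_int, y_int = generate_interpolated_samples(X_tr.values, y_tr.values, num_interpolated)
X_ns, y_ns = add_random_noise(X_tr.values, y_tr.values, num_noisy)
X_tr_aug = np.vstack([X_tr.values, X_int, X_ns])
y_tr_aug = np.hstack([y_tr.values, y_int, y_ns])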
In [73]:
# Linear regression model
model = LinearRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Mean Squared Error: 82.42049762225037
R-squared: 0.9891601928591927
In [76]:
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
Mean Absolute Error: 4.715865285080694
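RMSE puts the error back in death-count units; computing it from the predictions avoids hardcoding the MSE later:

# Root mean squared error, straight from the test predictions
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"RMSE: {rmse:.3f}")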
In [78]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
[Figures: residual plot, residual distribution, and actual vs. predicted values]
In [82]:
from sklearn.model_selection import cross_val_score

cv_mse = -cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_r2 = cross_val_score(model, X_train, y_train, scoring='r2', cv=5)

print(f"Cross-Validated MSE: {cv_mse.mean()}")
print(f"Cross-Validated R²: {cv_r2.mean()}")
Cross-Validated MSE: 99.77869593375976
Cross-Validated R²: 0.9867886919625475
In [84]:
# L1 (Lasso) and L2 (Ridge) regularization
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_val_score

ridge = Ridge(alpha=1.0)
lasso = Lasso(alpha=0.01)

ridge_mse = -cross_val_score(ridge, X_train, y_train, scoring='neg_mean_squared_error', cv=5).mean()
lasso_mse = -cross_val_score(lasso, X_train, y_train, scoring='neg_mean_squared_error', cv=5).mean()

print(f"Cross-Validated MSE (Ridge): {ridge_mse}")
print(f"Cross-Validated MSE (Lasso): {lasso_mse}")
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning:

Ill-conditioned matrix (rcond=6.1286e-38): result may not be accurate.

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning:

Ill-conditioned matrix (rcond=6.14396e-38): result may not be accurate.

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning:

Ill-conditioned matrix (rcond=6.01141e-38): result may not be accurate.

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning:

Ill-conditioned matrix (rcond=6.09296e-38): result may not be accurate.

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_ridge.py:216: LinAlgWarning:

Ill-conditioned matrix (rcond=6.19471e-38): result may not be accurate.

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.987e+04, tolerance: 4.807e+04

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.264e+04, tolerance: 4.594e+04

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.015e+04, tolerance: 4.896e+04

C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.196e+04, tolerance: 4.637e+04

Cross-Validated MSE (Ridge): 1.510271994860769
Cross-Validated MSE (Lasso): 1.5358253069445076
C:\Users\chaoh\anaconda3\Lib\site-packages\sklearn\linear_model\_coordinate_descent.py:697: ConvergenceWarning:

Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 4.934e+04, tolerance: 4.860e+04

In [85]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
In [86]:
# L2/Ridge regression with feature scaling and a grid search over alpha
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

param_grid = {'ridge__alpha': [0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

best_alpha = grid_search.best_params_['ridge__alpha']
best_cv_mse = -grid_search.best_score_

print(f"Best Alpha: {best_alpha}")

best_model = grid_search.best_estimator_

best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (Ridge): {mse}")
print(f"R-squared (Ridge): {r2}")

cv_mse = -cross_val_score(best_model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
cv_r2 = cross_val_score(best_model, X_train, y_train, scoring='r2', cv=5)

print(f"Cross-Validated MSE (Ridge): {np.mean(cv_mse)}")
print(f"Cross-Validated R² (Ridge): {np.mean(cv_r2)}")
Best Alpha: 0.1
Mean Squared Error (Ridge): 1.520949127860919
R-squared (Ridge): 0.9997999672934207
Cross-Validated MSE (Ridge): 1.51030214941287
Cross-Validated R² (Ridge): 0.9997929112941932
In [6]:
from math import sqrt
print(f"RMSE (Ridge): {sqrt(1.520949127860919)}")
RMSE (Ridge): 1.233267662699756
In [87]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import learning_curve
import numpy as np
import matplotlib.pyplot as plt

ridge_model = Ridge(alpha=0.1, random_state=42)

train_sizes, train_scores, validation_scores = learning_curve(
    estimator=ridge_model, 
    X=X_train, 
    y=y_train, 
    cv=5, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1 
)

train_scores_mean = -np.mean(train_scores, axis=1)  
validation_scores_mean = -np.mean(validation_scores, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, label="Training Error", color="blue", marker="o")
plt.plot(train_sizes, validation_scores_mean, label="Validation Error", color="green", marker="s")
plt.title("Learning Curve for Ridge Regression")
plt.xlabel("Training Set Size")
plt.ylabel("Mean Squared Error")
plt.legend()
plt.grid()
plt.show()
[Figure: learning curve for Ridge regression]
In [88]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
[Figures: residual plot, residual distribution, and actual vs. predicted values]
In [89]:
# Random Forest model
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
Mean Squared Error: 8.791593286627121
R-squared: 0.9988437442330884
Mean Absolute Error: 0.7599344260619
In [10]:
print(f"RMSE (RF): {sqrt(8.791593286627121)}")
RMSE (RF): 2.965062105020251
In [90]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE: {-cv_scores.mean()}")
Cross-validated MSE: 13.042910890148272
In [91]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
[Figures: residual plot, residual distribution, and actual vs. predicted values]
In [92]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import randint, uniform

param_dist = {
    'n_estimators': [100, 200, 300, 500],  
    'max_depth': [10, 20, 30, None],  
    'min_samples_split': randint(2, 15),  
    'min_samples_leaf': randint(1, 10), 
    'max_features': ['sqrt', 'log2']
}

rf_model = RandomForestRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=100,  
    cv=5, 
    scoring='neg_mean_squared_error', 
    n_jobs=-1,  
    verbose=2, 
    random_state=42 
)

random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validated MSE: {-random_search.best_score_}")

best_rf_model = random_search.best_estimator_

y_pred = best_rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Optimized Mean Squared Error: {mse}")
print(f"Optimized R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 9, 'n_estimators': 300}
Best Cross-Validated MSE: 123.18656446929113
Optimized Mean Squared Error: 100.84344694932958
Optimized R-squared: 0.9867372371208559
Mean Absolute Error: 2.850470181925766
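Note that the tuned forest's cross-validated MSE (123.2) is far worse than the default forest's (13.0): the grid restricts max_features to 'sqrt'/'log2', while recent scikit-learn defaults RandomForestRegressor to using all features. Widening the grid lets the search recover that setting:

# Let the search also consider the default (all features per split)
param_dist['max_features'] = ['sqrt', 'log2', 1.0]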
In [93]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
[Figures: residual plot, residual distribution, and actual vs. predicted values]
In [94]:
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np

train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_rf_model,  
    X=X_train, 
    y=y_train, 
    cv=5,
    scoring='neg_mean_squared_error', 
    n_jobs=-1,  
)

train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, label='Training Error', color='blue', marker='o')
plt.plot(train_sizes, test_scores_mean, label='Validation Error', color='green', marker='s')
plt.xlabel('Training Set Size')
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve for Random Forest')
plt.legend()
plt.grid(True)
plt.show()
[Figure: learning curve for random forest]
In [95]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

gb_model = GradientBoostingRegressor(random_state=42)

gb_model.fit(X_train, y_train)

y_pred = gb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
Mean Squared Error: 26.598299155986606
R-squared: 0.9965018358121808
Mean Absolute Error: 2.602388853480325
In [96]:
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated MSE: {-cv_scores.mean()}")
Cross-validated MSE: 28.57541184512715
In [97]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()

import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()

import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
[Figures: residual plot, residual distribution, and actual vs. predicted values]
In [98]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

param_dist = {
    'n_estimators': [400, 600, 800, 1000], 
    'max_depth': [3, 4, 5, 6], 
    'learning_rate': uniform(0.01, 0.03), 
    'subsample': uniform(0.7, 0.2),  
    'max_features': ['sqrt', 'log2', None], 
    'min_samples_split': [2, 4, 6], 
    'min_samples_leaf': [1, 2, 4]
}


gb_model = GradientBoostingRegressor(random_state=42)

random_search = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist,
    n_iter=100,  
    scoring='neg_mean_squared_error',
    cv=5, 
    n_jobs=-1,
    verbose=1,
    random_state=42
)

random_search.fit(X_train, y_train)

print(f"Best Parameters: {random_search.best_params_}")
print(f"Best Cross-Validated MSE: {-random_search.best_score_}")

best_gb_model = random_search.best_estimator_

y_pred = best_gb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Optimized Mean Squared Error: {mse}")
print(f"Optimized R-squared: {r2}")
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters: {'learning_rate': 0.03461918427231866, 'max_depth': 6, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000, 'subsample': 0.8810701283912127}
Best Cross-Validated MSE: 6.6972207975294555
Optimized Mean Squared Error: 6.964972264887826
Optimized R-squared: 0.999083978400149
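Here n_estimators landed on the upper edge of its grid (1000), a hint that a larger budget might help; extending the grid is a cheap follow-up:

# Probe beyond the current edge of the n_estimators grid
param_dist['n_estimators'] = [1000, 1500, 2000]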
In [9]:
print(f"RMSE (GBM): {sqrt(6.6972207975294555)}")
RMSE (GBM): 2.587898915632034
In [99]:
residuals = y_test - y_pred

plt.figure(figsize=(8, 6))
plt.scatter(y_pred, residuals, alpha=0.6, color='purple')
plt.axhline(y=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()
[Figure: residual plot]
In [100]:
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, color='orange', bins=30)
plt.axvline(x=0, color='red', linestyle='--', linewidth=2)
plt.xlabel('Residuals')
plt.title('Residual Distribution')
plt.show()
[Figure: residual distribution]
In [101]:
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
plt.scatter(y_test, y_pred, alpha=0.6, color='blue', label='Predicted vs. Actual')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2, label='Ideal Fit')
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values')
plt.legend()
plt.show()
[Figure: actual vs. predicted values]
In [102]:
from sklearn.model_selection import learning_curve
import numpy as np

train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_gb_model, X=X_train, y=y_train, cv=5, scoring='neg_mean_squared_error')

train_scores_mean = -np.mean(train_scores, axis=1)
test_scores_mean = -np.mean(test_scores, axis=1)

plt.figure(figsize=(8, 6))
plt.plot(train_sizes, train_scores_mean, label='Training Error', color='blue', marker='o')
plt.plot(train_sizes, test_scores_mean, label='Validation Error', color='green', marker='s')
plt.xlabel('Training Set Size')
plt.ylabel('Mean Squared Error')
plt.title('Learning Curve')
plt.legend()
plt.show()
[Figure: Learning Curve]
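The call above relies on learning_curve's default five training-set sizes; they can be made explicit and the runs parallelized. A sketch (the train_sizes grid and n_jobs setting are assumptions, not part of the original call):

train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_gb_model, X=X_train, y=y_train, cv=5,
    scoring='neg_mean_squared_error',
    train_sizes=np.linspace(0.1, 1.0, 10),  # 10 evenly spaced sizes from 10% to 100%
    n_jobs=-1)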
Deep Learning¶

In [150]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from sklearn.preprocessing import MinMaxScaler
 
df6 = df5.copy()
df7 = df5.copy()
 
# Convert Date to a numeric offset: days since the first observation
df6['Date'] = pd.to_datetime(df6['Date'])
df6['Date'] = (df6['Date'] - df6['Date'].min()) / pd.Timedelta(days=1)
 
target_column = 'COVID-19 Deaths'
 
X = df6.drop(columns=['Date', target_column])
y = df6[target_column]
 
# Augment the dataset to five times its original size
target_size = len(df6) * 5
current_size = len(X)
num_to_generate = target_size - current_size
 
# Create synthetic rows as convex combinations of two randomly chosen real rows
def generate_interpolated_samples(X, y, num_samples):
    interpolated_X = []
    interpolated_y = []
    for _ in range(num_samples):
        idx1, idx2 = np.random.choice(len(X), size=2, replace=False)
        alpha = np.random.rand()
        new_sample_X = alpha * X.iloc[idx1] + (1 - alpha) * X.iloc[idx2]
        new_sample_y = alpha * y.iloc[idx1] + (1 - alpha) * y.iloc[idx2]
        interpolated_X.append(new_sample_X)
        interpolated_y.append(new_sample_y)
    return pd.DataFrame(interpolated_X, columns=X.columns), np.array(interpolated_y)
 
# Create synthetic rows by adding small Gaussian noise to a random real row
def add_random_noise(X, y, num_samples, noise_level=0.01):
    noisy_X = []
    noisy_y = []
    for _ in range(num_samples):
        idx = np.random.choice(len(X))
        new_sample_X = X.iloc[idx] + np.random.normal(0, noise_level, len(X.columns))
        new_sample_y = y.iloc[idx] + np.random.normal(0, noise_level)
        noisy_X.append(new_sample_X)
        noisy_y.append(new_sample_y)
    return pd.DataFrame(noisy_X, columns=X.columns), np.array(noisy_y)
 
num_interpolated = num_to_generate // 2
num_noisy = num_to_generate - num_interpolated
 
X_interpolated, y_interpolated = generate_interpolated_samples(X, y, num_interpolated)
X_noisy, y_noisy = add_random_noise(X, y, num_noisy)
 
X_augmented = pd.concat([X, X_interpolated, X_noisy], ignore_index=True)
y_augmented = np.hstack([y, y_interpolated, y_noisy])
 
df_augmented = X_augmented.copy()
df_augmented[target_column] = y_augmented
# Tile the original dates to match the augmented length (5x the original);
# concatenating only two copies would leave the tail of the column as NaN
df_augmented['Date'] = pd.concat([df6['Date']] * 5, ignore_index=True)
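Before training, it is worth confirming that the augmentation produced the intended shape and that the tiled Date column has no gaps — a quick sanity-check sketch, assuming df_augmented and target_size from the cell above:

# The augmented frame should be exactly 5x the original and fully populated
assert len(df_augmented) == target_size
assert not df_augmented['Date'].isna().any()
print(df_augmented.shape)
print(df_augmented[target_column].describe())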
 
In [206]:
scaler_X = MinMaxScaler()
scaled_data_X = scaler_X.fit_transform(df_augmented.drop(columns=['Date', target_column]))
 
# Positional 80/20 split; note that the tail of df_augmented consists entirely
# of synthetic (noise-augmented) rows, so the test set is all synthetic data
split = int(0.2 * len(scaled_data_X))
X_train, y_train = scaled_data_X[:-split], df_augmented[target_column].iloc[:-split]
X_test, y_test = scaled_data_X[-split:], df_augmented[target_column].iloc[-split:]
 
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
 
model = Sequential([
    LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1)
])
 
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=32)
 
y_pred = model.predict(X_test)
 
Epoch 1/30
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

1954/1954 ━━━━━━━━━━━━━━━━━━━━ 8s 3ms/step - loss: 4284.4507 - mae: 28.7094 - val_loss: 603.2394 - val_mae: 13.0328
Epoch 2/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 688.1735 - mae: 14.7350 - val_loss: 244.2824 - val_mae: 7.9310
Epoch 3/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 427.4215 - mae: 10.9793 - val_loss: 153.3202 - val_mae: 6.2393
Epoch 4/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 350.9440 - mae: 9.3596 - val_loss: 151.7106 - val_mae: 6.3204
Epoch 5/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 270.3113 - mae: 8.4317 - val_loss: 112.6511 - val_mae: 5.8913
Epoch 6/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 257.1239 - mae: 8.0056 - val_loss: 120.6163 - val_mae: 6.3342
Epoch 7/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 232.8642 - mae: 7.5797 - val_loss: 102.2034 - val_mae: 5.4179
Epoch 8/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 217.2368 - mae: 7.3020 - val_loss: 90.6770 - val_mae: 4.9463
Epoch 9/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 221.9447 - mae: 7.1101 - val_loss: 120.1041 - val_mae: 5.6942
Epoch 10/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 194.8808 - mae: 6.7943 - val_loss: 76.6548 - val_mae: 5.4320
Epoch 11/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 184.7319 - mae: 6.6637 - val_loss: 161.5324 - val_mae: 5.4128
Epoch 12/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 197.6679 - mae: 6.5735 - val_loss: 66.7648 - val_mae: 5.2191
Epoch 13/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 176.8647 - mae: 6.3461 - val_loss: 77.2285 - val_mae: 4.5846
Epoch 14/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 182.1945 - mae: 6.2603 - val_loss: 69.7907 - val_mae: 4.4851
Epoch 15/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 154.3186 - mae: 6.0397 - val_loss: 62.3844 - val_mae: 4.3544
Epoch 16/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 167.0663 - mae: 6.0321 - val_loss: 54.4324 - val_mae: 4.1472
Epoch 17/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 171.3098 - mae: 5.9462 - val_loss: 72.6816 - val_mae: 5.4798
Epoch 18/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 147.4979 - mae: 5.7463 - val_loss: 64.2266 - val_mae: 4.3810
Epoch 19/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 166.7913 - mae: 5.7907 - val_loss: 86.1697 - val_mae: 4.7513
Epoch 20/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 169.6841 - mae: 5.8199 - val_loss: 250.4195 - val_mae: 8.9573
Epoch 21/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - loss: 159.7842 - mae: 5.6700 - val_loss: 84.8684 - val_mae: 6.3130
Epoch 22/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 156.0296 - mae: 5.6346 - val_loss: 51.3129 - val_mae: 4.5952
Epoch 23/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 133.6155 - mae: 5.4115 - val_loss: 58.2049 - val_mae: 5.1100
Epoch 24/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 139.6471 - mae: 5.3920 - val_loss: 242.0278 - val_mae: 6.3070
Epoch 25/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 148.5286 - mae: 5.4414 - val_loss: 88.7733 - val_mae: 5.9347
Epoch 26/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 205.9672 - mae: 5.5292 - val_loss: 87.0075 - val_mae: 5.3966
Epoch 27/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 113.4149 - mae: 5.1389 - val_loss: 160.8645 - val_mae: 6.7774
Epoch 28/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 128.2009 - mae: 5.2293 - val_loss: 67.4726 - val_mae: 5.1023
Epoch 29/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 127.4477 - mae: 5.1347 - val_loss: 128.9587 - val_mae: 6.3164
Epoch 30/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 113.1685 - mae: 5.0757 - val_loss: 64.3566 - val_mae: 5.5248
611/611 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
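Note that the features are min-max scaled while the target is left in raw death counts, which is why the loss starts in the thousands. Scaling the target as well often stabilizes LSTM training; a sketch of that alternative setup (model_s and scaler_y are illustrative names, and the Input layer avoids the warning shown above):

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

scaler_y = MinMaxScaler()
y_train_s = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1))

model_s = Sequential([
    Input(shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(64, activation='relu', return_sequences=True),
    Dropout(0.2),
    LSTM(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1)
])
model_s.compile(optimizer='adam', loss='mse', metrics=['mae'])
model_s.fit(X_train, y_train_s, validation_split=0.2, epochs=30, batch_size=32)

# Inverse-transform before computing metrics in death-count units
y_pred_s = scaler_y.inverse_transform(model_s.predict(X_test))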
In [207]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Mean Squared Error: 66.14676826523599
R-squared: 0.9927328175627637
In [208]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def create_model():
    model = Sequential([
        LSTM(64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
        Dropout(0.2),
        LSTM(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_mse = []

X = X_train  
y = y_train  

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}...")
    X_fold_train, X_fold_val = X[train_idx], X[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = create_model()
    model.fit(X_fold_train, y_fold_train, epochs=30, batch_size=32, verbose=0) 
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    fold_mse.append(mse)
    print(f"Fold {fold + 1} MSE: {mse}")

print(f"Average Cross-Validation MSE: {np.mean(fold_mse)}")
Training fold 1...
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 1 MSE: 58.864196662395436
Training fold 2...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 2 MSE: 101.79963373143964
Training fold 3...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 3 MSE: 109.49627370901372
Training fold 4...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 4 MSE: 122.16994173663969
Training fold 5...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 5 MSE: 111.92000552857714
Average Cross-Validation MSE: 100.85001027361311
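Each fold above trains for a fixed 30 epochs. An EarlyStopping callback could cut the per-fold cost by stopping once validation loss plateaus — a sketch of the fit call inside the fold loop (early_stop is an illustrative name):

from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
model = create_model()
model.fit(X_fold_train, y_fold_train,
          validation_data=(X_fold_val, y_fold_val),
          epochs=30, batch_size=32, callbacks=[early_stop], verbose=0)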
In [212]:
# Write the most recent predictions into the last 20% of df7 so the original
# and model-predicted series can be compared on one plot
y_pred_rescaled = y_pred.flatten()
 
num_test_rows = int(0.2 * len(df7))
 
y_pred_rescaled_trimmed = y_pred_rescaled[-num_test_rows:]
 
if len(y_pred_rescaled_trimmed) != num_test_rows:
    raise ValueError(f"Trimmed predicted values' length ({len(y_pred_rescaled_trimmed)}) does not match the expected number of rows ({num_test_rows})")
 
target_column_index = df7.columns.get_loc(target_column)
 
df7.iloc[-num_test_rows:, target_column_index] = y_pred_rescaled_trimmed
In [214]:
df5_grouped = df5.groupby('Date')[target_column].mean()
df7_grouped = df7.groupby('Date')[target_column].mean()
 
plt.figure(figsize=(12, 6))
plt.plot(df5_grouped.index, df5_grouped.values, label='Original Data (df5)', color='blue', linewidth=2)
plt.plot(df7_grouped.index, df7_grouped.values, label='Modified Data (df7)', color='red', linestyle='--', linewidth=2)
plt.xlabel('Date')
plt.ylabel('COVID-19 Deaths')
plt.title('COVID-19 Deaths Comparison: Grouped by Date')
plt.legend()
plt.grid(True)
plt.show()
[Figure: COVID-19 Deaths Comparison, Grouped by Date]
In [225]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras_tuner.tuners import RandomSearch  # the package was renamed from 'kerastuner'

scaler_X = MinMaxScaler()
scaled_data_X = scaler_X.fit_transform(df_augmented.drop(columns=['Date', target_column]))

split = int(0.2 * len(scaled_data_X))
X_train, y_train = scaled_data_X[:-split], df_augmented[target_column].iloc[:-split]
X_test, y_test = scaled_data_X[-split:], df_augmented[target_column].iloc[-split:]

X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

def build_model(hp):
    model = Sequential([
        LSTM(
            units=hp.Int('units1', min_value=32, max_value=128, step=16),
            activation='relu',
            input_shape=(X_train.shape[1], X_train.shape[2]),
            return_sequences=True
        ),
        Dropout(hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)),
        LSTM(
            units=hp.Int('units2', min_value=16, max_value=64, step=16),
            activation='relu'
        ),
        Dropout(hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(
        optimizer=hp.Choice('optimizer', ['adam', 'rmsprop']),
        loss='mse',
        metrics=['mae']
    )
    return model

tuner = RandomSearch(
    build_model,
    objective='val_mae',
    max_trials=10,  
    executions_per_trial=1,  
    directory='tuner_logs',  
    project_name='lstm_tuning'
)

tuner.search(X_train, y_train, validation_split=0.2, epochs=10, batch_size=32)

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best hyperparameter:units1={best_hps.get('units1')}, units2={best_hps.get('units2')}, "
      f"dropout1={best_hps.get('dropout1')}, dropout2={best_hps.get('dropout2')}, "
      f"optimizer={best_hps.get('optimizer')}")

best_model = tuner.hypermodel.build(best_hps)
best_model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=32)

y_pred = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f"MSE:{mse}")
Reloading Tuner from tuner_logs\lstm_tuning\tuner0.json
Best hyperparameters: units1=96, units2=64, dropout1=0.1, dropout2=0.1, optimizer=rmsprop
Epoch 1/30
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

1954/1954 ━━━━━━━━━━━━━━━━━━━━ 7s 2ms/step - loss: 4762.8560 - mae: 28.7164 - val_loss: 401.6913 - val_mae: 10.0315
Epoch 2/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 351.7172 - mae: 9.9824 - val_loss: 165.0062 - val_mae: 6.2835
Epoch 3/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 180.0067 - mae: 6.9049 - val_loss: 154.5481 - val_mae: 7.8355
Epoch 4/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 145.4411 - mae: 6.1036 - val_loss: 114.0388 - val_mae: 4.5863
Epoch 5/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 124.6900 - mae: 5.5366 - val_loss: 87.4020 - val_mae: 3.9355
Epoch 6/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 103.0333 - mae: 5.1290 - val_loss: 86.1112 - val_mae: 4.8812
Epoch 7/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 101.6065 - mae: 4.9477 - val_loss: 66.2670 - val_mae: 3.4518
Epoch 8/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 91.5161 - mae: 4.7847 - val_loss: 68.5574 - val_mae: 3.7937
Epoch 9/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 92.4686 - mae: 4.6195 - val_loss: 55.3026 - val_mae: 4.2518
Epoch 10/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 83.9193 - mae: 4.4766 - val_loss: 93.0204 - val_mae: 5.1037
Epoch 11/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 73.8946 - mae: 4.3010 - val_loss: 43.4825 - val_mae: 3.1535
Epoch 12/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 75.3073 - mae: 4.2707 - val_loss: 33.1330 - val_mae: 2.5636
Epoch 13/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 68.0840 - mae: 4.0985 - val_loss: 42.8328 - val_mae: 3.3675
Epoch 14/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 64.2629 - mae: 4.0251 - val_loss: 49.7365 - val_mae: 3.4961
Epoch 15/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 71.6059 - mae: 3.9543 - val_loss: 33.0844 - val_mae: 3.3712
Epoch 16/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 2ms/step - loss: 55.7071 - mae: 3.8210 - val_loss: 27.5599 - val_mae: 2.5060
Epoch 17/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 58.9544 - mae: 3.7550 - val_loss: 33.5092 - val_mae: 3.0504
Epoch 18/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 49.7374 - mae: 3.6048 - val_loss: 18.8979 - val_mae: 1.8980
Epoch 19/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 53.1799 - mae: 3.5613 - val_loss: 19.2844 - val_mae: 1.9462
Epoch 20/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 50.5979 - mae: 3.4279 - val_loss: 31.2365 - val_mae: 2.2756
Epoch 21/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 48.3958 - mae: 3.4575 - val_loss: 20.1160 - val_mae: 2.1782
Epoch 22/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 51.4337 - mae: 3.4064 - val_loss: 20.3351 - val_mae: 2.4709
Epoch 23/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 48.2273 - mae: 3.3874 - val_loss: 16.4264 - val_mae: 1.9576
Epoch 24/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 42.6366 - mae: 3.2962 - val_loss: 13.0901 - val_mae: 1.9384
Epoch 25/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 45.1562 - mae: 3.2402 - val_loss: 15.1436 - val_mae: 1.9503
Epoch 26/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 45.7242 - mae: 3.2364 - val_loss: 13.3391 - val_mae: 1.7806
Epoch 27/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 53.2993 - mae: 3.2616 - val_loss: 31.7279 - val_mae: 3.2128
Epoch 28/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 43.9514 - mae: 3.1336 - val_loss: 13.9642 - val_mae: 1.7350
Epoch 29/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 10s 2ms/step - loss: 43.7534 - mae: 3.1034 - val_loss: 14.1895 - val_mae: 1.8326
Epoch 30/30
1954/1954 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 42.4841 - mae: 3.0597 - val_loss: 13.0012 - val_mae: 1.6620
611/611 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
MSE: 12.771035268984182
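Since the tuner search and retraining take several minutes, the tuned model can be persisted and reloaded instead of re-run — a sketch using Keras's native format ('lstm_best.keras' is an illustrative filename):

from tensorflow.keras.models import load_model

best_model.save('lstm_best.keras')
reloaded_model = load_model('lstm_best.keras')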
In [227]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
Mean Squared Error: 12.771035268984182
R-squared: 0.9985969164383067
In [11]:
print(f"RMSE (LSTM): {sqrt(12.771035268984182)}")
RMSE (LSTM): 3.5736585271936914
In [231]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

def create_model():
    model = Sequential([
        LSTM(96, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
        Dropout(0.1),
        LSTM(64, activation='relu'),
        Dropout(0.1),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    model.compile(optimizer='rmsprop', loss='mse', metrics=['mae'])
    return model

kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_mse = []

X = X_train  
y = y_train  

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Training fold {fold + 1}...")
    X_fold_train, X_fold_val = X[train_idx], X[val_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # create_model() already bakes in the tuned hyperparameters, so rebuilding
    # the model from tuner.hypermodel here would be redundant
    model = create_model()
    model.fit(X_fold_train, y_fold_train, epochs=20, batch_size=32, verbose=0)
    
    y_pred = model.predict(X_fold_val)
    
    mse = mean_squared_error(y_fold_val, y_pred)
    fold_mse.append(mse)
    print(f"Fold {fold + 1} MSE: {mse}")

print(f"Average Cross-Validation MSE: {np.mean(fold_mse)}")
Training fold 1...
C:\Users\chaoh\anaconda3\Lib\site-packages\keras\src\layers\rnn\rnn.py:204: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 1 MSE: 32.09960144679565
Training fold 2...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 2 MSE: 14.963886774129815
Training fold 3...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 3 MSE: 60.439355601183664
Training fold 4...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 4 MSE: 19.8639216633464
Training fold 5...
489/489 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
Fold 5 MSE: 22.022435886051785
Average Cross-Validation MSE: 29.877840274301462
In [233]:
# As before, overwrite the last 20% of df7 with the tuned model's predictions
# for the comparison plot below
y_pred_rescaled = y_pred.flatten()
 
num_test_rows = int(0.2 * len(df7))
 
y_pred_rescaled_trimmed = y_pred_rescaled[-num_test_rows:]
 
if len(y_pred_rescaled_trimmed) != num_test_rows:
    raise ValueError(f"Trimmed predicted values' length ({len(y_pred_rescaled_trimmed)}) does not match the expected number of rows ({num_test_rows})")
 
target_column_index = df7.columns.get_loc(target_column)
 
df7.iloc[-num_test_rows:, target_column_index] = y_pred_rescaled_trimmed
In [235]:
df5_grouped = df5.groupby('Date')[target_column].mean()
df7_grouped = df7.groupby('Date')[target_column].mean()
 
plt.figure(figsize=(12, 6))
plt.plot(df5_grouped.index, df5_grouped.values, label='Original Data (df5)', color='blue', linewidth=2)
plt.plot(df7_grouped.index, df7_grouped.values, label='Modified Data (df7)', color='red', linestyle='--', linewidth=2)
plt.xlabel('Date')
plt.ylabel('COVID-19 Deaths')
plt.title('COVID-19 Deaths Comparison: Grouped by Date')
plt.legend()
plt.grid(True)
plt.show()
[Figure: COVID-19 Deaths Comparison, Grouped by Date]